Stat 331 - Final

Lillie Allen, Nick Bukovec, Nikhil Koganti, Madeline Willett

3/2/2022

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(prettydoc)
library(here)
## here() starts at /Users/nick/Desktop/Code/stat331/stat331-final
library(broom)
library(ggtext)
library(gganimate)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(knitr)
library(DT)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Load Data

oil <- read_csv(here::here("oil_consumption_per_cap.csv"))
## Rows: 79 Columns: 56
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): country
## dbl (55): 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sdi <- read_csv(here::here("sdi.csv"))
## Rows: 164 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): country
## dbl (30): 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

oil_clean <- oil %>%
  # Coerce types
  transmute(
    country = as.factor(country),
    across(`1965`:`2019`, as.numeric)
    ) %>% 
  # Pivot 
  pivot_longer(
    cols = `1965`:`2019`, 
    names_to = "year",
    values_to = "Oil Consumption per Person (tonnes/person)",
    )

sdi_clean <- sdi %>% 
  # Coerce types
  transmute(
    country = as.factor(country),
    across(`1990`:`2019`, as.numeric)
    ) %>% 
  # Pivot
  pivot_longer(
    cols = `1990`:`2019`, 
    names_to = "year",
    values_to = "Sustainable Development Index",
    ) 

dataset <- sdi_clean %>% 
  # Join data by year and country
  inner_join(oil_clean, by = c("year", "country")) %>% 
  # Drop any missing data
  mutate(year = as.integer(year)) %>% 
  drop_na()

Data Visualization

# See Source 1
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#0072B2",
               "#D55E00")

main_vis <- dataset %>%
  ggplot(mapping = aes(
    x = `Oil Consumption per Person (tonnes/person)`, 
    y = `Sustainable Development Index`,
    color = case_when(
      country == "Singapore" ~ "Singapore",
      country == "Bangladesh" ~ "Bangladesh",
      country == "Sri Lanka" ~ "Sri Lanka",
      country == "Uzbekistan" ~ "Uzbekistan",
      country == "United Arab Emirates" ~ "United Arab Emirates",
      country == "Kuwait" ~ "Kuwait",
      TRUE ~ "All Other Countries"
    ),
    size = case_when(
      country %in% c("Singapore", "Bangladesh", "Sri Lanka", "Uzbekistan", 
                     "United Arab Emirates", "Kuwait") ~ 3,
      TRUE ~ 0.5
    ) ,
    alpha = case_when(
      country %in% c("Singapore", "Bangladesh", "Sri Lanka", "Uzbekistan", 
                     "United Arab Emirates", "Kuwait") ~ 0.95,
      TRUE ~ 0.94
    ),
    text = glue::glue(
      "Year: {year}
      Country: {country}
      Oil Consumption per Person: {`Oil Consumption per Person (tonnes/person)`} tonnes/person
      Sustainable Development Index: {`Sustainable Development Index`}
      "
    )
    )) + 
  geom_jitter() +
  # See Source 2
  labs(
    title = " How Oil Consumption Does Not Allow for Sustainable Development 
<span style='font-size:11pt'>Countries such as<span style='color:#E69F00;'> Bangladesh</span>, <span style='color:#CC79A7;'>Sri Lanka</span>, and <span style='color:#D55E00;'>Uzbekistan</span> shine while other countries like <br /><span style='color:#009E73;'>Singapore</span>, the <span style='color:#0072B2;'>United Arab Emirates</span>, and <span style='color:#56B4E9;'>Kuwait</span> guzzle oil without benefiting their citizens. </span>",
    x = "Oil Consumption per Person (tonnes/person)",
    y = "Sustainable Development Index",
    caption = "Data from Gapminder",
  ) +
  scale_y_continuous(limits = c(0, 110)) +
  scale_x_sqrt() +
  scale_color_manual(values = cbPalette, name = "") +
  scale_size_continuous(range = c(0.5, 1.3), name = "") +
  scale_alpha_continuous(range = c(0.45, 0.7), name = "Country") +
  # See Source 3
  theme(
    text = element_text(family = "Helvetica"),
    plot.caption = element_text(hjust = 1, 
                                color = "blue", 
                                face = "italic"),
    plot.title.position = "plot",
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    legend.position = "none")

ggplotly(main_vis, tooltip = "text") %>% 
  layout(title = list(pad = list(l = 10 )))

Visualization Over Time

# See Source 1
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#0072B2",
               "#D55E00")
anim_plot <- dataset %>%
  ggplot(mapping = aes(
    x = `Oil Consumption per Person (tonnes/person)`, 
    y = `Sustainable Development Index`,
    size = `Oil Consumption per Person (tonnes/person)`,
    color = country
  )) +
  geom_jitter(
    alpha = 0.80,
    # size = 0.65
  ) +
  # See Source 2
  labs(
    title = "**Visualizing Oil Consumption and Sustainable Development ({frame_time})** 
    <br />
    <span style='font-size:11pt'>As countries consume more oil, the benefits
    to the lives of their citizens are outweighed <br /> by their carbon 
    footprints. However, as time has passed more countries have lowered their <br/>oil consumption while
    maintaining their sustainable development index.
    </span>",
    x = "Oil Consumption per Person (tonnes/person)",
    y = "Sustainable Development Index",
    caption = "Data from Gapminder",
  ) +
  scale_y_continuous(limits = c(0, 100)) +
  scale_x_sqrt() +
  # scale_color_manual(values = cbPalette, name = "") +
  # scale_colour_brewer() +
  scale_size_continuous(range = c(0.4, 10), name = "") +
  # See Source 3
  theme(
    text = element_text(family = "Helvetica"),
    plot.title = element_markdown(lineheight = 1.1),
    plot.caption = element_text(hjust = 1, 
                                color = "blue", 
                                face = "italic"),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    legend.position = "none"
    ) +
  # https://gganimate.com/
  transition_time(year) +
  ease_aes(default = 'cubic-in-out') +
  # https://gganimate.com/articles/gganimate.html#labeling-1
  enter_fade() +
  exit_fade()
  
animate(anim_plot,
        duration = 20,
        fps = 10)

Linear Regression

dataset_lm <- dataset %>%
  lm(`Sustainable Development Index` ~ `Oil Consumption per Person (tonnes/person)`, data = .)

\[\widehat{SDI} = 67.32 - 7.714 \times (Oil Consumption)\]

The intercept, 67.32, is the average Sustainable Development Index y = SDI for those countries where the average Oil Consumption per Person is 0. It has no practical interpretation here, since observing an SDI of 0 is impossible. The slope for Oil Consumption per Person, -7.714, summarizes the relationship between the SDI and Oil Consumption per Person variables. The sign is negative, suggesting a negative relationship between these two variables, meaning countries with higher average Oil Consumption per Person tend to have lower Sustainable Development Indexes. For every increase of 1 unit in Oil Consumption per Person, there is an associated decrease of, on average, 7.714 units of SDI.

lm_tibble <- broom::augment(dataset_lm) %>% 
  summarise('Response Variance' = var(`Sustainable Development Index`),
            'Fitted Value Variance' = var(.fitted),
            'Residual Variance' = var(.resid)) %>% 
  mutate('Proportion of Variance Accounted for in Model' = `Fitted Value Variance` / (`Response Variance` + `Fitted Value Variance` + `Residual Variance`))

Table Using kable()

lm_tibble %>%
  kbl() %>%
  kable_styling()
Response Variance Fitted Value Variance Residual Variance Proportion of Variance Accounted for in Model
374.7564 150.5912 224.1652 0.2009188

Table Using kableExtra

lm_tibble %>%
  kbl(caption = "Table 1: Investigating Whether our Regression does a Good Job Accounting for Variabilty in Response Values", centering = TRUE) %>%
  kable_classic(full_width = F, 
                position = "center", 
                html_font = "Georgia", 
                font_size = 15) %>%
  row_spec(0, bold = T)
Table 1: Investigating Whether our Regression does a Good Job Accounting for Variabilty in Response Values
Response Variance Fitted Value Variance Residual Variance Proportion of Variance Accounted for in Model
374.7564 150.5912 224.1652 0.2009188
 # kable_styling(bootstrap_options = c("condensed"))

# https://cran.r-project.org/web/packages/kableExtra/vignettes/awesome_table_in_html.html

Discussion

The variance in the response variable, 374.7564, tells us how much Countries differ in Sustainable Development Index. The variance in the fitted values, 150.5912, from our regression model tells us how much the fitted values from our linear regression model vary. The variance of the residuals, 224.1652, tells us how much “the left-overs” from the model vary. The proportion of the variability in the response values that was accounted for by our regression model is 0.2009188; thus, our linear regression model explains about 20% of the variation in Sustainable Development Index. This suggests that our model does not tell us everything we need to know about Sustainable Development Index.

predict_sdi <- predict(dataset_lm)
oil_sigma <- sigma(dataset_lm)

noise <- function(predicted_vals, mean = 0, sd){
  predicted_vals + rnorm(length(predicted_vals),
            mean,
            sd)
}

expected_data <- tibble(predicted_values = noise(predict_sdi, 
                                          sd = oil_sigma
                                          )
                   )
expected_data <- dataset %>% 
  filter(!is.na(`Oil Consumption per Person (tonnes/person)`), 
         !is.na(`Sustainable Development Index`)
         ) %>% 
  select(`Oil Consumption per Person (tonnes/person)`, `Sustainable Development Index`) %>% 
  bind_cols(expected_data)
pred <- expected_data %>% 
  ggplot(aes(x = `Oil Consumption per Person (tonnes/person)`,
             y = predicted_values)) +
  geom_point() +
  labs(x = "Oil Consumption (tonnes/person)",
       y = "Simulated SDI")

obs <- dataset %>% 
  ggplot(aes(x = `Oil Consumption per Person (tonnes/person)`,
             y = `Sustainable Development Index`)) + 
  geom_point() + 
  labs(x = "Oil Consumption (tonnes/person)",
       y = "Observed SDI")


gridExtra::grid.arrange(pred, obs, nrow = 1)

expected_data %>%
  lm(`Sustainable Development Index` ~ predicted_values, data = .) %>%
  glance() %>%
  select(r.squared) %>%
  pull()
## [1] 0.1637061

The observed data appears to be more extreme than the simulated data. It also seems to be scattered a lot more than the simulated data. The r squared value of the simulated data is 17% which is less than the 20% for the observed data. Thus, our linear model explains less simulated SDI variance compared to the observed SDI variance.

Datatable Using DT

datatable(dataset,
          caption = htmltools::tags$caption(
            style = 'caption-side: bottom; text-align: center;',
            'Table 2: ', htmltools::em("Exploring Countries' SDI and Oil Consumption Over Time")  #This will probably end up being Table 1 so we will need to edit
            ), 
          rownames = FALSE,
          colnames = c("Country" = "country", 
                       "Year" = "year"),
          filter = "top",
          clas = "hover cell-border stripe")
# https://www.r-bloggers.com/2021/05/datatable-editor-dt-package-in-r/
# https://rstudio.github.io/DT/